In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
# Load the app table from a local pickle file and preview the first rows.
# NOTE(review): the path is absolute and machine-specific, and unpickling can
# execute arbitrary code -- only load files from a trusted source.
app_pickle_path = '/Users/krystal/Desktop/app_cleaned.pickle'
app = pd.read_pickle(app_pickle_path)
app.head()
Out[2]:
In [ ]:
# Remove exact duplicate rows, then renumber the rows 0..n-1 so that row
# labels stay aligned with row positions after the removal (drop_duplicates
# keeps the surviving rows' original labels, which leaves gaps in the index).
app = app.drop_duplicates().reset_index(drop=True)
In [ ]:
# Turn the textual sizes into plain numbers.
# The last two characters of each entry are read as the unit and the last
# three characters (presumably a separator plus the unit, e.g. "1.2 GB" --
# TODO confirm the format) are stripped to get the numeric part.  Values in
# GB are multiplied by 1000 so they share a scale with the remaining entries.
#
# This is vectorised on purpose: looking rows up with app['size'][i] for
# i in range(len(app)) is label-based and fails or skips rows whenever the
# index is not exactly 0..n-1, and chained assignment (app['size'][i] = ...)
# may write to a temporary copy instead of the frame.
if not pd.api.types.is_numeric_dtype(app['size']):  # no-op when the cell is re-run
    size_text = app['size']
    size_unit = size_text.str[-2:]
    size_number = pd.to_numeric(size_text.str[:-3])
    # Keep the parsed number unless the unit is GB, in which case scale it.
    app['size'] = size_number.where(size_unit != 'GB', size_number * 1000)
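Convert the app sizes from text to numbers. Sizes given in GB are multiplied by 1000 so that they are on the same scale as the remaining entries. Note that a factor of 1000 corresponds to GB → MB rather than GB → KB, so this assumes all other sizes are recorded in MB.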
In [22]:
# Keep only the columns needed for the rating analysis and discard every row
# that has a missing value in any of them.
rating_columns = [
    "name",
    "size",
    "overall_rating",
    "current_rating",
    'num_current_rating',
    "num_overall_rating",
]
rating_df = app.loc[:, rating_columns].dropna()
In [23]:
# Lookup table from the textual star labels to their numeric value
# (whole stars are ints, half stars are floats).
rating_cleaned = {
    '1 star': 1,
    '1 and a half stars': 1.5,
    '2 stars': 2,
    '2 and a half stars': 2.5,
    '3 stars': 3,
    '3 and a half stars': 3.5,
    '4 stars': 4,
    '4 and a half stars': 4.5,
    '5 stars': 5,
}
In [24]:
# Swap the textual star labels in overall_rating for their numeric values;
# entries that are not keys of rating_cleaned are left unchanged by replace().
overall_as_number = rating_df['overall_rating'].replace(rating_cleaned)
rating_df['overall_rating'] = overall_as_number
In [25]:
# Weighted rating = share * current_rating + (1 - share) * overall_rating,
# where share is the fraction of all ratings that belong to the current version.
# NOTE(review): rows with num_overall_rating == 0 would yield NaN/inf here --
# confirm whether such rows exist.
current_share = np.divide(rating_df['num_current_rating'], rating_df['num_overall_rating'])
rating_df['weighted_rating'] = (
    current_share * rating_df['current_rating']
    + (1 - current_share) * rating_df['overall_rating']
)
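Add a `weighted_rating` column to the data set as a measure of each app's quality: the current-version rating and the overall rating are averaged, weighted by the share of all ratings that belong to the current version.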
In [27]:
# Scatter plot of app size against weighted rating for every row of rating_df.
fig, ax = plt.subplots()
ax.scatter(rating_df['size'], rating_df['weighted_rating'])
ax.set_xlabel('Size of app')
ax.set_ylabel('Quality of app')
ax.set_title('Relationship between app size and quality')
plt.show()
In [28]:
# Restrict the analysis to apps whose size value is at most 500.
is_small_app = rating_df['size'] <= 500
rating_df_2 = rating_df.loc[is_small_app]
In [29]:
# Same scatter plot as before, limited to the size-filtered subset rating_df_2.
fig, ax = plt.subplots()
ax.scatter(rating_df_2['size'], rating_df_2['weighted_rating'])
ax.set_xlabel('Size of app')
ax.set_ylabel('Quality of app')
ax.set_title('Relationship between app size(less than 500) and quality')
plt.show()
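I draw scatter plots of app size against the weighted rating of each app. The second plot only contains apps whose size value is at most 500 (the filter is `size <= 500`; since GB values were scaled by 1000, the unit is presumably MB rather than KB). The plots suggest a positive association between app size and app rating. Further analysis is still needed to confirm this.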
In [ ]: